## Rows: 11,314
## Columns: 106
## $ id <dbl> 10595, 10990, 10993, 1...
## $ listing_url <chr> "https://www.airbnb.co...
## $ scrape_id <dbl> 2.02e+13, 2.02e+13, 2....
## $ last_scraped <date> 2020-06-17, 2020-06-1...
## $ name <chr> "96m2, 3BR, 2BA, Metro...
## $ summary <chr> "Athens Furnished Apar...
## $ space <chr> "Athens Furnished Apar...
## $ description <chr> "Athens Furnished Apar...
## $ experiences_offered <chr> "none", "none", "none"...
## $ neighborhood_overview <chr> "Ampelokipi district i...
## $ notes <chr> "Although is very easy...
## $ transit <chr> "Note: 5-day ticket fo...
## $ access <chr> "Guest have access to ...
## $ interaction <chr> "-Our reception is 10 ...
## $ house_rules <chr> "- Parties, meetings, ...
## $ thumbnail_url <lgl> NA, NA, NA, NA, NA, NA...
## $ medium_url <lgl> NA, NA, NA, NA, NA, NA...
## $ picture_url <chr> "https://a0.muscache.c...
## $ xl_picture_url <lgl> NA, NA, NA, NA, NA, NA...
## $ host_id <dbl> 37177, 37177, 37177, 3...
## $ host_url <chr> "https://www.airbnb.co...
## $ host_name <chr> "Emmanouil", "Emmanoui...
## $ host_since <date> 2009-09-08, 2009-09-0...
## $ host_location <chr> "Athens, Attica, Greec...
## $ host_about <chr> "Athens Quality Apartm...
## $ host_response_time <chr> "within an hour", "wit...
## $ host_response_rate <chr> "100%", "100%", "100%"...
## $ host_acceptance_rate <chr> "100%", "100%", "100%"...
## $ host_is_superhost <lgl> TRUE, TRUE, TRUE, TRUE...
## $ host_thumbnail_url <chr> "https://a0.muscache.c...
## $ host_picture_url <chr> "https://a0.muscache.c...
## $ host_neighbourhood <chr> "Ambelokipi", "Ambelok...
## $ host_listings_count <dbl> 6, 6, 6, 6, 6, 2, 1, 2...
## $ host_total_listings_count <dbl> 6, 6, 6, 6, 6, 2, 1, 2...
## $ host_verifications <chr> "['email', 'phone', 'r...
## $ host_has_profile_pic <lgl> TRUE, TRUE, TRUE, TRUE...
## $ host_identity_verified <lgl> TRUE, TRUE, TRUE, TRUE...
## $ street <chr> "Athens, Attica, Greec...
## $ neighbourhood <chr> "Ambelokipi", "Ambelok...
## $ neighbourhood_cleansed <chr> "<U+0391><U+039C><U+03A0><U+0395><U+039B><U+039F><U+039A><U+0397><U+03A0><U+039F><U+0399>", "<U+0391><U+039C><U+03A0><U+0395><U+039B><U+039F>...
## $ neighbourhood_group_cleansed <lgl> NA, NA, NA, NA, NA, NA...
## $ city <chr> "Athens", "Athens", "A...
## $ state <chr> "Attica", "Attica", "A...
## $ zipcode <chr> "11526", "11526", "115...
## $ market <chr> "Athens", "Athens", "A...
## $ smart_location <chr> "Athens, Greece", "Ath...
## $ country_code <chr> "GR", "GR", "GR", "GR"...
## $ country <chr> "Greece", "Greece", "G...
## $ latitude <dbl> 38, 38, 38, 38, 38, 38...
## $ longitude <dbl> 23.8, 23.8, 23.8, 23.8...
## $ is_location_exact <lgl> TRUE, TRUE, TRUE, TRUE...
## $ property_type <chr> "Apartment", "Apartmen...
## $ room_type <chr> "Entire home/apt", "En...
## $ accommodates <dbl> 8, 4, 2, 4, 4, 4, 1, 5...
## $ bathrooms <dbl> 2.0, 1.0, 1.0, 1.0, 1....
## $ bedrooms <dbl> 3, 1, 0, 1, 1, 1, 1, 2...
## $ beds <dbl> 5, 1, 1, 2, 1, 2, 1, 2...
## $ bed_type <chr> "Real Bed", "Real Bed"...
## $ amenities <chr> "{TV,\"Cable TV\",Inte...
## $ square_feet <dbl> 1076, NA, NA, NA, NA, ...
## $ price <chr> "$122.00", "$45.00", "...
## $ weekly_price <chr> "$700.00", "$420.00", ...
## $ monthly_price <chr> "$2,800.00", "$1,680.0...
## $ security_deposit <chr> "$0.00", "$0.00", "$0....
## $ cleaning_fee <chr> "$25.00", "$15.00", "$...
## $ guests_included <dbl> 4, 2, 2, 2, 2, 2, 1, 1...
## $ extra_people <chr> "$13.00", "$5.00", "$0...
## $ minimum_nights <dbl> 1, 1, 1, 1, 1, 2, 1, 5...
## $ maximum_nights <dbl> 45, 60, 60, 60, 30, 73...
## $ minimum_minimum_nights <dbl> 1, 1, 1, 1, 1, 2, 1, 5...
## $ maximum_minimum_nights <dbl> 4, 4, 4, 4, 4, 2, 1, 5...
## $ minimum_maximum_nights <dbl> 45, 60, 60, 60, 30, 11...
## $ maximum_maximum_nights <dbl> 45, 60, 60, 60, 30, 11...
## $ minimum_nights_avg_ntm <dbl> 1.3, 1.4, 1.8, 1.5, 1....
## $ maximum_nights_avg_ntm <dbl> 45, 60, 60, 60, 30, 11...
## $ calendar_updated <chr> "2 weeks ago", "2 week...
## $ has_availability <lgl> TRUE, TRUE, TRUE, TRUE...
## $ availability_30 <dbl> 30, 13, 13, 17, 12, 30...
## $ availability_60 <dbl> 60, 43, 43, 47, 42, 60...
## $ availability_90 <dbl> 90, 73, 72, 77, 69, 83...
## $ availability_365 <dbl> 365, 271, 347, 275, 26...
## $ calendar_last_scraped <date> 2020-06-17, 2020-06-1...
## $ number_of_reviews <dbl> 25, 34, 48, 21, 17, 45...
## $ number_of_reviews_ltm <dbl> 5, 3, 1, 2, 1, 31, 0, ...
## $ first_review <date> 2011-05-20, 2012-09-0...
## $ last_review <date> 2020-03-15, 2020-01-0...
## $ review_scores_rating <dbl> 97, 98, 97, 96, 95, 96...
## $ review_scores_accuracy <dbl> 10, 10, 10, 10, 10, 10...
## $ review_scores_cleanliness <dbl> 10, 10, 10, 10, 10, 10...
## $ review_scores_checkin <dbl> 10, 10, 10, 10, 10, 10...
## $ review_scores_communication <dbl> 10, 10, 10, 10, 10, 10...
## $ review_scores_location <dbl> 9, 10, 10, 9, 9, 10, N...
## $ review_scores_value <dbl> 10, 10, 10, 10, 9, 10,...
## $ requires_license <lgl> TRUE, TRUE, TRUE, TRUE...
## $ license <chr> "478825", "400315", "4...
## $ jurisdiction_names <lgl> NA, NA, NA, NA, NA, NA...
## $ instant_bookable <lgl> TRUE, TRUE, TRUE, TRUE...
## $ is_business_travel_ready <lgl> FALSE, FALSE, FALSE, F...
## $ cancellation_policy <chr> "moderate", "moderate"...
## $ require_guest_profile_picture <lgl> FALSE, FALSE, FALSE, F...
## $ require_guest_phone_verification <lgl> FALSE, FALSE, FALSE, F...
## $ calculated_host_listings_count <dbl> 6, 6, 6, 6, 6, 2, 1, 2...
## $ calculated_host_listings_count_entire_homes <dbl> 6, 6, 6, 6, 6, 2, 0, 2...
## $ calculated_host_listings_count_private_rooms <dbl> 0, 0, 0, 0, 0, 0, 1, 0...
## $ calculated_host_listings_count_shared_rooms <dbl> 0, 0, 0, 0, 0, 0, 0, 0...
## $ reviews_per_month <dbl> 0.23, 0.36, 0.51, 0.17...
skim(athens_data)| Name | athens_data |
| Number of rows | 11314 |
| Number of columns | 106 |
| _______________________ | |
| Column type frequency: | |
| character | 47 |
| Date | 5 |
| logical | 15 |
| numeric | 39 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| listing_url | 0 | 1.00 | 34 | 37 | 0 | 11314 | 0 |
| name | 7 | 1.00 | 1 | 98 | 0 | 11114 | 0 |
| summary | 314 | 0.97 | 1 | 1000 | 0 | 10140 | 0 |
| space | 2804 | 0.75 | 1 | 1000 | 0 | 7793 | 0 |
| description | 176 | 0.98 | 1 | 1000 | 0 | 10600 | 0 |
| experiences_offered | 0 | 1.00 | 4 | 4 | 0 | 1 | 0 |
| neighborhood_overview | 3139 | 0.72 | 1 | 1000 | 0 | 6611 | 0 |
| notes | 6750 | 0.40 | 1 | 1000 | 0 | 3465 | 0 |
| transit | 3403 | 0.70 | 1 | 1000 | 0 | 6470 | 0 |
| access | 5447 | 0.52 | 1 | 1000 | 0 | 4552 | 0 |
| interaction | 4427 | 0.61 | 1 | 1000 | 0 | 5010 | 0 |
| house_rules | 5782 | 0.49 | 2 | 1000 | 0 | 4155 | 0 |
| picture_url | 0 | 1.00 | 81 | 146 | 0 | 11201 | 0 |
| host_url | 0 | 1.00 | 39 | 43 | 0 | 6272 | 0 |
| host_name | 1 | 1.00 | 1 | 33 | 0 | 2650 | 0 |
| host_location | 17 | 1.00 | 2 | 94 | 0 | 575 | 0 |
| host_about | 4823 | 0.57 | 1 | 4636 | 0 | 3036 | 9 |
| host_response_time | 1 | 1.00 | 3 | 18 | 0 | 5 | 0 |
| host_response_rate | 1 | 1.00 | 2 | 4 | 0 | 47 | 0 |
| host_acceptance_rate | 1 | 1.00 | 2 | 4 | 0 | 76 | 0 |
| host_thumbnail_url | 1 | 1.00 | 55 | 106 | 0 | 6249 | 0 |
| host_picture_url | 1 | 1.00 | 57 | 109 | 0 | 6249 | 0 |
| host_neighbourhood | 1826 | 0.84 | 4 | 29 | 0 | 66 | 0 |
| host_verifications | 0 | 1.00 | 2 | 147 | 0 | 218 | 0 |
| street | 0 | 1.00 | 10 | 62 | 0 | 274 | 0 |
| neighbourhood | 1 | 1.00 | 4 | 17 | 0 | 32 | 0 |
| neighbourhood_cleansed | 0 | 1.00 | 4 | 32 | 0 | 45 | 0 |
| city | 4 | 1.00 | 2 | 30 | 0 | 94 | 0 |
| state | 10261 | 0.09 | 1 | 38 | 0 | 132 | 0 |
| zipcode | 262 | 0.98 | 5 | 12 | 0 | 212 | 0 |
| market | 126 | 0.99 | 6 | 21 | 0 | 2 | 0 |
| smart_location | 0 | 1.00 | 10 | 39 | 0 | 102 | 0 |
| country_code | 0 | 1.00 | 2 | 2 | 0 | 1 | 0 |
| country | 0 | 1.00 | 6 | 6 | 0 | 1 | 0 |
| property_type | 0 | 1.00 | 4 | 23 | 0 | 26 | 0 |
| room_type | 0 | 1.00 | 10 | 15 | 0 | 4 | 0 |
| bed_type | 0 | 1.00 | 5 | 13 | 0 | 5 | 0 |
| amenities | 0 | 1.00 | 2 | 1646 | 0 | 10540 | 0 |
| price | 0 | 1.00 | 5 | 9 | 0 | 279 | 0 |
| weekly_price | 10790 | 0.05 | 6 | 9 | 0 | 148 | 0 |
| monthly_price | 10817 | 0.04 | 7 | 10 | 0 | 156 | 0 |
| security_deposit | 2984 | 0.74 | 5 | 9 | 0 | 94 | 0 |
| cleaning_fee | 1852 | 0.84 | 5 | 7 | 0 | 85 | 0 |
| extra_people | 0 | 1.00 | 5 | 7 | 0 | 39 | 0 |
| calendar_updated | 0 | 1.00 | 5 | 13 | 0 | 71 | 0 |
| license | 3949 | 0.65 | 4 | 140 | 0 | 6735 | 0 |
| cancellation_policy | 0 | 1.00 | 8 | 27 | 0 | 6 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| last_scraped | 0 | 1.00 | 2020-06-16 | 2020-06-18 | 2020-06-16 | 3 |
| host_since | 1 | 1.00 | 2009-09-08 | 2020-06-11 | 2017-03-06 | 2423 |
| calendar_last_scraped | 0 | 1.00 | 2020-06-16 | 2020-06-18 | 2020-06-16 | 3 |
| first_review | 2677 | 0.76 | 2010-07-08 | 2020-06-17 | 2018-09-07 | 1954 |
| last_review | 2677 | 0.76 | 2013-05-23 | 2020-06-17 | 2020-01-29 | 985 |
Variable type: logical
| skim_variable | n_missing | complete_rate | mean | count |
|---|---|---|---|---|
| thumbnail_url | 11314 | 0 | NaN | : |
| medium_url | 11314 | 0 | NaN | : |
| xl_picture_url | 11314 | 0 | NaN | : |
| host_is_superhost | 1 | 1 | 0.38 | FAL: 7034, TRU: 4279 |
| host_has_profile_pic | 1 | 1 | 1.00 | TRU: 11287, FAL: 26 |
| host_identity_verified | 1 | 1 | 0.19 | FAL: 9217, TRU: 2096 |
| neighbourhood_group_cleansed | 11314 | 0 | NaN | : |
| is_location_exact | 0 | 1 | 0.92 | TRU: 10370, FAL: 944 |
| has_availability | 0 | 1 | 1.00 | TRU: 11314 |
| requires_license | 0 | 1 | 1.00 | TRU: 11314 |
| jurisdiction_names | 11314 | 0 | NaN | : |
| instant_bookable | 0 | 1 | 0.75 | TRU: 8528, FAL: 2786 |
| is_business_travel_ready | 0 | 1 | 0.00 | FAL: 11314 |
| require_guest_profile_picture | 0 | 1 | 0.01 | FAL: 11247, TRU: 67 |
| require_guest_phone_verification | 0 | 1 | 0.01 | FAL: 11201, TRU: 113 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| id | 0 | 1.00 | 2.74e+07 | 1.09e+07 | 1.06e+04 | 2.01e+07 | 2.90e+07 | 3.60e+07 | 4.38e+07 | ▂▃▆▇▇ |
| scrape_id | 0 | 1.00 | 2.02e+13 | 0.00e+00 | 2.02e+13 | 2.02e+13 | 2.02e+13 | 2.02e+13 | 2.02e+13 | ▁▁▇▁▁ |
| host_id | 0 | 1.00 | 1.29e+08 | 9.79e+07 | 3.72e+04 | 3.65e+07 | 1.19e+08 | 2.12e+08 | 3.50e+08 | ▇▅▃▅▂ |
| host_listings_count | 1 | 1.00 | 1.68e+01 | 5.52e+01 | 0.00e+00 | 1.00e+00 | 2.00e+00 | 9.00e+00 | 1.12e+03 | ▇▁▁▁▁ |
| host_total_listings_count | 1 | 1.00 | 1.68e+01 | 5.52e+01 | 0.00e+00 | 1.00e+00 | 2.00e+00 | 9.00e+00 | 1.12e+03 | ▇▁▁▁▁ |
| latitude | 0 | 1.00 | 3.80e+01 | 1.00e-02 | 3.80e+01 | 3.80e+01 | 3.80e+01 | 3.80e+01 | 3.80e+01 | ▃▇▆▂▁ |
| longitude | 0 | 1.00 | 2.37e+01 | 1.00e-02 | 2.37e+01 | 2.37e+01 | 2.37e+01 | 2.37e+01 | 2.38e+01 | ▁▇▇▂▁ |
| accommodates | 0 | 1.00 | 3.90e+00 | 2.04e+00 | 1.00e+00 | 2.00e+00 | 4.00e+00 | 5.00e+00 | 3.00e+01 | ▇▁▁▁▁ |
| bathrooms | 1 | 1.00 | 1.21e+00 | 5.10e-01 | 0.00e+00 | 1.00e+00 | 1.00e+00 | 1.00e+00 | 1.20e+01 | ▇▁▁▁▁ |
| bedrooms | 10 | 1.00 | 1.40e+00 | 8.60e-01 | 0.00e+00 | 1.00e+00 | 1.00e+00 | 2.00e+00 | 1.00e+01 | ▇▁▁▁▁ |
| beds | 46 | 1.00 | 2.14e+00 | 1.52e+00 | 0.00e+00 | 1.00e+00 | 2.00e+00 | 3.00e+00 | 4.00e+01 | ▇▁▁▁▁ |
| square_feet | 11226 | 0.01 | 6.68e+02 | 5.47e+02 | 0.00e+00 | 2.69e+02 | 5.92e+02 | 1.08e+03 | 2.79e+03 | ▇▅▃▁▁ |
| guests_included | 0 | 1.00 | 1.92e+00 | 1.25e+00 | 1.00e+00 | 1.00e+00 | 2.00e+00 | 2.00e+00 | 1.60e+01 | ▇▁▁▁▁ |
| minimum_nights | 0 | 1.00 | 4.26e+00 | 2.18e+01 | 1.00e+00 | 1.00e+00 | 2.00e+00 | 2.00e+00 | 1.00e+03 | ▇▁▁▁▁ |
| maximum_nights | 0 | 1.00 | 1.63e+03 | 9.40e+04 | 1.00e+00 | 6.20e+01 | 1.12e+03 | 1.12e+03 | 1.00e+07 | ▇▁▁▁▁ |
| minimum_minimum_nights | 0 | 1.00 | 3.96e+00 | 1.89e+01 | 1.00e+00 | 1.00e+00 | 2.00e+00 | 2.00e+00 | 1.00e+03 | ▇▁▁▁▁ |
| maximum_minimum_nights | 0 | 1.00 | 5.36e+00 | 3.31e+01 | 1.00e+00 | 1.00e+00 | 2.00e+00 | 2.00e+00 | 1.00e+03 | ▇▁▁▁▁ |
| minimum_maximum_nights | 0 | 1.00 | 8.80e+02 | 4.54e+02 | 1.00e+00 | 1.00e+03 | 1.12e+03 | 1.12e+03 | 1.00e+04 | ▇▁▁▁▁ |
| maximum_maximum_nights | 0 | 1.00 | 8.90e+02 | 4.46e+02 | 1.00e+00 | 1.12e+03 | 1.12e+03 | 1.12e+03 | 1.00e+04 | ▇▁▁▁▁ |
| minimum_nights_avg_ntm | 0 | 1.00 | 4.37e+00 | 2.17e+01 | 1.00e+00 | 1.00e+00 | 2.00e+00 | 2.00e+00 | 1.00e+03 | ▇▁▁▁▁ |
| maximum_nights_avg_ntm | 0 | 1.00 | 8.84e+02 | 4.48e+02 | 1.00e+00 | 1.03e+03 | 1.12e+03 | 1.12e+03 | 1.00e+04 | ▇▁▁▁▁ |
| availability_30 | 0 | 1.00 | 1.92e+01 | 1.22e+01 | 0.00e+00 | 4.00e+00 | 2.60e+01 | 3.00e+01 | 3.00e+01 | ▃▁▁▁▇ |
| availability_60 | 0 | 1.00 | 4.00e+01 | 2.35e+01 | 0.00e+00 | 1.80e+01 | 5.20e+01 | 5.90e+01 | 6.00e+01 | ▃▁▁▂▇ |
| availability_90 | 0 | 1.00 | 6.16e+01 | 3.45e+01 | 0.00e+00 | 4.10e+01 | 7.90e+01 | 8.90e+01 | 9.00e+01 | ▃▁▁▂▇ |
| availability_365 | 0 | 1.00 | 2.32e+02 | 1.33e+02 | 0.00e+00 | 1.21e+02 | 2.74e+02 | 3.58e+02 | 3.65e+02 | ▃▂▂▂▇ |
| number_of_reviews | 0 | 1.00 | 3.52e+01 | 6.12e+01 | 0.00e+00 | 1.00e+00 | 9.00e+00 | 4.20e+01 | 7.11e+02 | ▇▁▁▁▁ |
| number_of_reviews_ltm | 0 | 1.00 | 1.11e+01 | 1.71e+01 | 0.00e+00 | 0.00e+00 | 3.00e+00 | 1.60e+01 | 1.57e+02 | ▇▁▁▁▁ |
| review_scores_rating | 2739 | 0.76 | 9.53e+01 | 7.04e+00 | 2.00e+01 | 9.40e+01 | 9.70e+01 | 1.00e+02 | 1.00e+02 | ▁▁▁▁▇ |
| review_scores_accuracy | 2751 | 0.76 | 9.76e+00 | 6.80e-01 | 2.00e+00 | 1.00e+01 | 1.00e+01 | 1.00e+01 | 1.00e+01 | ▁▁▁▁▇ |
| review_scores_cleanliness | 2751 | 0.76 | 9.66e+00 | 7.40e-01 | 2.00e+00 | 9.00e+00 | 1.00e+01 | 1.00e+01 | 1.00e+01 | ▁▁▁▁▇ |
| review_scores_checkin | 2753 | 0.76 | 9.85e+00 | 5.50e-01 | 2.00e+00 | 1.00e+01 | 1.00e+01 | 1.00e+01 | 1.00e+01 | ▁▁▁▁▇ |
| review_scores_communication | 2752 | 0.76 | 9.84e+00 | 5.70e-01 | 2.00e+00 | 1.00e+01 | 1.00e+01 | 1.00e+01 | 1.00e+01 | ▁▁▁▁▇ |
| review_scores_location | 2753 | 0.76 | 9.57e+00 | 7.60e-01 | 2.00e+00 | 9.00e+00 | 1.00e+01 | 1.00e+01 | 1.00e+01 | ▁▁▁▁▇ |
| review_scores_value | 2754 | 0.76 | 9.62e+00 | 7.40e-01 | 2.00e+00 | 9.00e+00 | 1.00e+01 | 1.00e+01 | 1.00e+01 | ▁▁▁▁▇ |
| calculated_host_listings_count | 0 | 1.00 | 9.98e+00 | 2.14e+01 | 1.00e+00 | 1.00e+00 | 2.00e+00 | 7.00e+00 | 1.38e+02 | ▇▁▁▁▁ |
| calculated_host_listings_count_entire_homes | 0 | 1.00 | 7.57e+00 | 1.52e+01 | 0.00e+00 | 1.00e+00 | 1.00e+00 | 6.00e+00 | 8.50e+01 | ▇▁▁▁▁ |
| calculated_host_listings_count_private_rooms | 0 | 1.00 | 2.14e+00 | 1.27e+01 | 0.00e+00 | 0.00e+00 | 0.00e+00 | 0.00e+00 | 1.12e+02 | ▇▁▁▁▁ |
| calculated_host_listings_count_shared_rooms | 0 | 1.00 | 5.00e-02 | 6.40e-01 | 0.00e+00 | 0.00e+00 | 0.00e+00 | 0.00e+00 | 1.20e+01 | ▇▁▁▁▁ |
| reviews_per_month | 2677 | 0.76 | 1.65e+00 | 1.74e+00 | 1.00e-02 | 3.40e-01 | 1.00e+00 | 2.42e+00 | 1.29e+01 | ▇▂▁▁▁ |
Based on our initial data analysis we identified 4 major types of variables in the underlying data set:
We also have seen that we have 11,314 observations (apartments) & a total of 106 data points per apartment.
We identified many variables that have a characteristic which make it either not interesting to analyze (only one/ very few distinct values, text strings) or that we think we will not use in the analysis later on.
So we excluded these columns/ data points in order to make the data easier & faster to handle.
athens_data_red <- athens_data %>%
#Select the relevant variables
select(
id,
neighbourhood,
zipcode,
latitude,
longitude,
property_type,
room_type,
accommodates,
bathrooms,
bedrooms,
beds,
price,
weekly_price,
monthly_price,
security_deposit,
cleaning_fee,
guests_included,
extra_people,
minimum_nights,
maximum_nights,
availability_365,
number_of_reviews_ltm,
review_scores_rating,
review_scores_checkin,
review_scores_cleanliness,
review_scores_accuracy,
review_scores_communication,
review_scores_location,
review_scores_value,
cancellation_policy,
reviews_per_month,
host = host_id,
host_response_time,
host_response_rate,
host_acceptance_rate,
host_is_superhost,
host_listings_count,
host_total_listings_count,
host_identity_verified,number_of_reviews,
host_instant_booking = instant_bookable
)We now only have 41 columns left, which make the data set easier to handle.
In a next step we will adjust the type of some variables so that we can actually can work with the data more easily.
# Transform character values to numeric values
athens_data_clean <- athens_data_red %>%
mutate(
price = as.numeric(str_remove_all(price, "[$ ,]")),
weekly_price = as.numeric(str_remove_all(weekly_price, "[$ , ]")),
monthly_price = as.numeric(str_remove_all(monthly_price, "[$ ,]")),
cleaning_fee = as.numeric(str_remove_all(cleaning_fee, "[$ ,]")),
security_deposit = as.numeric(str_remove_all(security_deposit, "[$ ,]")),
extra_people = as.numeric(str_remove_all(extra_people, "[$ ,]")),
host_response_rate = as.numeric(str_remove_all(cleaning_fee, "[% ,]")),
host_acceptance_rate = as.numeric(str_remove_all(cleaning_fee, "[% ,]"))
)# Create factor variables for room types
room_types <- unique(athens_data_clean$room_type)
athens_data_clean$room_type <- factor(athens_data_clean$room_type, labels = room_types)
# Create factor variables for cancellation policies
cancellation_policies <- unique(athens_data_clean$cancellation_policy)
athens_data_clean$cancellation_policy <- factor(athens_data_clean$cancellation_policy, labels = cancellation_policies)
# Create factor variables for host response time
athens_data_clean <- athens_data_clean %>%
mutate(host_response_time = fct_relevel(host_response_time,
"within an hour",
"within a few hours",
"within a day",
"a few days or more",
))The issue with the property types is that there are to much in order to generate reasonable factors. We need to analyze how much the share of each category. Best case would be that the majority of the property type share is done with a small number. If that is the case we can just summarize the rest in a new category calles "other".
# Identify the amount of each property type
most_com_properties <- athens_data_clean %>%
count(property_type) %>%
mutate(percentage = n/sum(n)*100)%>%
arrange(desc(n))
most_com_properties## # A tibble: 26 x 3
## property_type n percentage
## <chr> <int> <dbl>
## 1 Apartment 9677 85.5
## 2 House 386 3.41
## 3 Condominium 261 2.31
## 4 Serviced apartment 187 1.65
## 5 Loft 180 1.59
## 6 Aparthotel 139 1.23
## 7 Hotel 135 1.19
## 8 Boutique hotel 120 1.06
## 9 Bed and breakfast 49 0.433
## 10 Hostel 38 0.336
## # ... with 16 more rows
As the 5 most common property types account for ~95% of the total share we can just focus on them and summarize the rest in "Others"
# First we need to summarize the other values in the Category "Others"
athens_data_clean <- athens_data_clean %>%
mutate(
property_type = case_when(
property_type %in% c("Apartment","House", "Condominium","Serviced Apartment", "Loft")
~ property_type,
TRUE ~ "Other"))
# In a next step we can make a factor out of the 6 pre-defined categories
athens_data_clean <- athens_data_clean %>%
mutate(
property_type = fct_relevel(property_type,
"Apartment",
"House",
"Condominium",
"Serviced Apartment",
"Loft",
"Other"))We now have transformed the data types of the most variables in order to make the data set even cleaner. We have deleted unnecessary values, adjusted wrong variable types and now we will further inspect the quality of our data.
In a this step we will further manipulate the data set. In specific we will correct the NA values in cases in which we can estimate the value.
# We will replace the NAs in the weekly prices and assume there is no discount if NA
athens_data_clean$weekly_price[is.na(athens_data_clean$weekly_price)] <-
athens_data_clean$price *7
# We will replace the NAs in the monthly prices and assume there is no discount if NA
athens_data_clean$monthly_price[is.na(athens_data_clean$monthly_price)] <-
athens_data_clean$price * 30
# We will replace the NAs in the security deposit & cleaning fee and assume 0 if NA
athens_data_clean$cleaning_fee[is.na(athens_data_clean$cleaning_fee)] <- 0
athens_data_clean$security_deposit[is.na(athens_data_clean$security_deposit)] <- 0We now have cleaned the data to a nearly perfect amount. The only thing we haven't yet included are outliers which will be captured in the next paragraph.
We will screen the most important variable price, which we need in our analysis later on, for potential outliers. We will exclude the extreme values, which make no sense economically (way too high prices). Reasons which could explain these extremly high prices are unwillingness to list at the moment, fake listings or extremly luxurious apartments.
# Quick plot to see outliers
athens_data_clean %>%
ggplot(aes(x = price)) +
geom_histogram() +
labs(title= "Distribution of prices in our original data")# Looks very scewed, probably a log-normal distribution, use log -> normal
athens_data_clean %>%
ggplot(aes(x = log(price))) +
geom_histogram()# There seem to be a few outliers. We will remove them using the IQR method, becauses we belive that keeping those values would skew our analysis
IQR.outliers <- function(x) {
Q3 <- quantile(x,0.95)
Q1 <- quantile(x,0.05)
IQR <- (Q3-Q1)
left <- (Q1-(1.5*IQR))
right <- (Q3+(1.5*IQR))
print(c(left, right))
c(x[x <left],x[x>right])
}
# Print outliers
IQR.outliers(athens_data_clean$price)## 5% 95%
## -180 352
## [1] 354 600 459 400 400 385 1000 515 410 640 412 502 650 400 400
## [16] 600 450 1000 1000 1000 495 1000 400 500 500 500 500 500 500 525
## [31] 404 800 700 500 402 450 450 540 360 810 487 7000 7000 7000 7000
## [46] 390 460 400 600 400 353 357 400 426 500 1500 900 1200 450 450
## [61] 800 400 990 600 1000 500 1000 5000 400 800 360 1000 390 500 500
## [76] 400 400 400 600 1290 999 1000 700 720 700 700 1000
athens_data_clean %>%
filter(!(price %in% IQR.outliers(athens_data_clean$price))) %>%
ggplot(aes(x = log(price))) +
geom_histogram()## 5% 95%
## -180 352
#Defining our final data set, which has no more outliers
athens_data_final <- athens_data_clean %>%
filter(!(price %in% IQR.outliers(athens_data_clean$price)))## 5% 95%
## -180 352
As we now have finally derived with a data set, which has only the relevant values, right variable types, adjusted NA values and is corrected for outliers, we can finally start with the analysis of the data.
# Plot our airbnbs
qmplot(longitude, latitude, data = athens_data_final, color = price)# Syntagma coordinates
syntagma <- c(37.975344, 23.73472)
names(syntagma) <- c("longitude", "latitude")
# Athene map
athens_map = get_map(location=c(23.68,
37.945,
23.8,
38.035), maptype="terrain-background")
athens_map <- ggmap(athens_map)
# We dont want to see the axis when we are ploting maps
map_theme <- theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
axis.title.y=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank())
# Plot the map and Syntagma, is there a connection between prices and the centre?
athens_map +
geom_point(data=athens_data_final, aes(x = longitude, y = latitude, color = price)) +
geom_point(aes(x = syntagma['latitude'], syntagma['longitude']),
color = 'red', size = 5) +
theme_minimal() +
map_theme +
labs(title="Airbnbs around the centre seem to be more expensive",
subtitle = "Centre - Syntagma Square")# Now that we assume that there is a connection, we calculate the distance for each airbnb
head(athens_data_final)## # A tibble: 6 x 41
## id neighbourhood zipcode latitude longitude property_type room_type
## <dbl> <chr> <chr> <dbl> <dbl> <fct> <fct>
## 1 10595 Ambelokipi 11526 38.0 23.8 Apartment Entire h~
## 2 10990 Ambelokipi 11526 38.0 23.8 Apartment Entire h~
## 3 10993 Ambelokipi 115 26 38.0 23.8 Apartment Entire h~
## 4 10995 Ambelokipi 11526 38.0 23.8 Apartment Entire h~
## 5 27262 Ambelokipi 11526 38.0 23.8 Apartment Entire h~
## 6 28186 Plaka 105 63 38.0 23.7 Loft Entire h~
## # ... with 34 more variables: accommodates <dbl>, bathrooms <dbl>,
## # bedrooms <dbl>, beds <dbl>, price <dbl>, weekly_price <dbl>,
## # monthly_price <dbl>, security_deposit <dbl>, cleaning_fee <dbl>,
## # guests_included <dbl>, extra_people <dbl>, minimum_nights <dbl>,
## # maximum_nights <dbl>, availability_365 <dbl>, number_of_reviews_ltm <dbl>,
## # review_scores_rating <dbl>, review_scores_checkin <dbl>,
## # review_scores_cleanliness <dbl>, review_scores_accuracy <dbl>,
## # review_scores_communication <dbl>, review_scores_location <dbl>,
## # review_scores_value <dbl>, cancellation_policy <fct>,
## # reviews_per_month <dbl>, host <dbl>, host_response_time <fct>,
## # host_response_rate <dbl>, host_acceptance_rate <dbl>,
## # host_is_superhost <lgl>, host_listings_count <dbl>,
## # host_total_listings_count <dbl>, host_identity_verified <lgl>,
## # number_of_reviews <dbl>, host_instant_booking <lgl>
# Calculate the distance
athens_data_final<- athens_data_final %>%
rowwise() %>%
mutate(
cent_dist = distm(c(latitude, longitude), c(37.975344, 23.73472),
fun = distHaversine)[1,1]
)
# Test if our numbers are correct visually
athens_map +
geom_point(data=athens_data_final, aes(x = longitude, y = latitude, color = cent_dist)) +
geom_point(aes(x = syntagma['latitude'], syntagma['longitude']), color = 'red', size = 5) +
theme_minimal() +
map_theme# How many room types are there?
length(unique(athens_data_final$room_type))## [1] 4
avg_dist <- athens_data_final %>%
group_by(neighbourhood) %>%
summarise(
avg_dist = mean(cent_dist)
) %>%
arrange(-avg_dist)
athens_data_final %>%
filter(!is.na(neighbourhood)) %>%
select(neighbourhood,
room_type) %>%
group_by(neighbourhood,
room_type) %>%
summarise(n = n()) %>%
mutate(perc = n/sum(n)) %>%
ggplot(aes(fill=room_type, x=perc, y=factor(neighbourhood,levels = avg_dist$neighbourhood))) +
geom_bar(position="fill", stat="identity") +
labs(title="Average distance from the centre does not seem to impact room types",
subtitle = "Average distance in decreasing order") +
ylab("") +
xlab("") +
guides(fill=guide_legend(title="Room types"))We will try to find the best fitting model to predict per night prices
athens_data_final %>%
na.omit() %>%
select_if(is.numeric) %>%
cor() %>%
as.data.frame() %>%
select(price) %>%
add_rownames(var = "variable") %>%
arrange(price) %>%
ggplot(aes(x = price, y = reorder(variable, price))) +
geom_col() +
ylab("") +
xlab("Correlation") +
labs(title = "Distance from central is the most negative correlation",
subtitle = "Correlations with price")athens_data_final %>%
ggplot(aes(x=cent_dist, y=price)) +
geom_point()Using correlations doesnt seem to work too well, we will need to find another way
# First we will split our data into a training and testing set
# Set seed so we will get the same results
set.seed(202119)
size <- floor(0.75 * nrow(athens_data_final))
train_ind <- sample(seq_len(nrow(athens_data_final)), size = size)
train <- athens_data_final[train_ind, ]
test <- athens_data_final[-train_ind, ]OLS
library(stats)
# To choose a model we will use Akaike's information criterion
# Univariate regression
model1 <- lm(log(price) ~ as.factor(accommodates), data=train)
# Are airbnbs that accomodate 8 people necessarily 2 times as expensive? We do not think so, therefore we use factors instead.
summary(model1) ##
## Call:
## lm(formula = log(price) ~ as.factor(accommodates), data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.5461 -0.3660 -0.0536 0.2873 2.5607
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.9608 0.0316 93.63 <2e-16 ***
## as.factor(accommodates)2 0.6837 0.0336 20.32 <2e-16 ***
## as.factor(accommodates)3 0.8064 0.0351 22.96 <2e-16 ***
## as.factor(accommodates)4 0.9811 0.0332 29.53 <2e-16 ***
## as.factor(accommodates)5 1.0790 0.0372 28.98 <2e-16 ***
## as.factor(accommodates)6 1.2585 0.0365 34.46 <2e-16 ***
## as.factor(accommodates)7 1.3169 0.0522 25.21 <2e-16 ***
## as.factor(accommodates)8 1.5177 0.0486 31.20 <2e-16 ***
## as.factor(accommodates)9 1.5841 0.0834 18.99 <2e-16 ***
## as.factor(accommodates)10 1.6629 0.0819 20.29 <2e-16 ***
## as.factor(accommodates)11 1.7005 0.1402 12.13 <2e-16 ***
## as.factor(accommodates)12 1.8879 0.1125 16.77 <2e-16 ***
## as.factor(accommodates)13 1.2676 0.2025 6.26 4e-10 ***
## as.factor(accommodates)14 1.8410 0.1402 13.13 <2e-16 ***
## as.factor(accommodates)15 2.0437 0.2025 10.09 <2e-16 ***
## as.factor(accommodates)16 1.9380 0.1049 18.48 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.529 on 8404 degrees of freedom
## Multiple R-squared: 0.235, Adjusted R-squared: 0.234
## F-statistic: 173 on 15 and 8404 DF, p-value: <2e-16
summary(model1)$r.squared # R2 same as correlation squared, because of univariate model## [1] 0.235
# cor(log(train$price), train$accommodates)^2
model1 %>% AIC() # 13266## [1] 13195
lm(log(price) ~ accommodates + bedrooms, data=train)##
## Call:
## lm(formula = log(price) ~ accommodates + bedrooms, data = train)
##
## Coefficients:
## (Intercept) accommodates bedrooms
## 3.3370 0.1107 0.0838
It would be interesting to use sentiment analysis and use it for pricing
from textblob import TextBlob
testimonial = TextBlob("I love cupcakes.")
print(testimonial.sentiment.polarity)## 0.5
a = testimonial.sentiment.polaritypy$testimonial## I love cupcakes.
py$a## [1] 0.5